NOTE: All computations were performed on a Mac laptop with an Intel 1.4 GHz processor and 16 GB RAM. No GPU was involved; everything runs on the CPU only.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sklearn
import numpy as np
from sklearn.metrics import mean_squared_error
from torch.utils import data as T
import torch
from torchviz import make_dot, make_dot_from_trace
from torchsummary import summary
from torch.autograd import Variable
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import seaborn as sns
import time
import matplotlib.patches as mpatches
import plotly.io as pio
pio.renderers.default = "notebook"
I took stock market data from Yahoo Finance, covering 2002 to the current date. (Note: the file loaded below, "GME (1).csv", is GameStop data, even though this notebook originally described it as Apple.)
# Load the raw Yahoo Finance CSV (the file is GME, i.e. GameStop prices).
df = pd.read_csv("GME (1).csv")
# Peek at the first rows (rendered by the notebook).
df.head()
# Keep only the closing price; every model below is univariate.
df = df[['Close']]
We create 3 different sets for training, validation and testing. We will use the last two years of data for testing, the 3rd-last year for validation, and the remaining data for training.
# Chronological split (no shuffling -- this is a time series):
# rows 0-4026 train, rows 4027-4249 validation, the remainder test.
train_data = df[:4027]
validation_data = df[4027:4250]
test_data = df[4250:]
Using MinMax scaler to scale our training and testing data
# Scale prices into [0, 1]. The scaler is fitted on the training split only,
# so no information from the validation/test periods leaks into training.
scaler = MinMaxScaler().fit(train_data)
# Apply the training-split scaling to all three splits.
train_data = scaler.transform(train_data)
validation_data = scaler.transform(validation_data)
test_data = scaler.transform(test_data)
Below function creates sequence based on the sequence length provided as the parameters. It will create a new dataset with feature as the number of days of data as the sequence and the target variable as the following day data. Suppose sequence is 3, then the first 3 data will be our X or features and the 4th day data will be our Y or the target variable.
def create_sequences(data, seq_length):
    """Build supervised (window, next-value) pairs from a series.

    Each sample's features are `seq_length` consecutive rows of `data`
    and its target is the row immediately after the window.

    Parameters:
        data: 2-D array-like of shape (T, 1) (scaled closing prices).
        seq_length: number of past observations per feature window.

    Returns:
        (X, y) numpy arrays of shapes (N, seq_length, 1) and (N, 1).

    NOTE(review): the `- 1` below drops the final valid window (N is one
    less than the maximum possible). Kept as-is because downstream code
    hard-codes the resulting sample counts (4022 / 441 / 218).
    """
    windows, targets = [], []
    for start in range(len(data) - seq_length - 1):
        windows.append(data[start:start + seq_length])
        targets.append(data[start + seq_length])
    return np.array(windows), np.array(targets)
# Window length: 4 previous closes predict the next close.
seq_length = 4
# Build windowed (feature, target) pairs for each split.
X_train, y_train = create_sequences(train_data, seq_length)
X_test, y_test = create_sequences(test_data, seq_length)
X_val, y_val = create_sequences(validation_data, seq_length)
# Convert features and targets of all three splits to float32 tensors,
# the dtype PyTorch layers expect.
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float()
X_val = torch.from_numpy(X_val).float()
y_val = torch.from_numpy(y_val).float()
Below function trains a model on the number of epoch as passed in the function parameter. We are using MSE as the loss function for all the predictions and Adam as the optimizer.
def training(epoch, model, train, validation):
    """Full-batch training loop with Adam and MSE loss.

    Parameters:
        epoch: number of epochs to run.
        model: any nn.Module mapping the features in `train[0]` to targets.
        train: [X_train, y_train] tensors.
        validation: [X_val, y_val] tensors.

    Returns:
        (train_losses, val_losses, [elapsed_seconds]) where the loss lists
        have one entry per epoch and the timing is CPU process time.
    """
    n_epochs = epoch
    lr = 0.01
    X_tr, y_tr = train[0], train[1]
    X_vl, y_vl = validation[0], validation[1]
    # Mean-squared error: the task is scalar regression.
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    history_train = []
    history_val = []
    elapsed = []
    t0 = time.process_time()
    for ep in range(n_epochs):
        preds = model(X_tr)
        optimizer.zero_grad()
        train_loss = loss_fn(preds, y_tr)
        # Validation pass is evaluation only -- no gradients needed.
        with torch.no_grad():
            val_pred = model(X_vl)
            current_val = loss_fn(val_pred.float(), y_vl)
        history_val.append(current_val.item())
        history_train.append(train_loss.item())
        train_loss.backward()
        optimizer.step()
        if ep % 20 == 0:
            print("Epoch: %d, training loss: %1.5f , validation loss: %1.5f"
                  % (ep, train_loss.item(), current_val.item()))
    elapsed.append(time.process_time() - t0)
    return history_train, history_val, elapsed
The function below plots the training and validation loss returned by our training function above. It produces two plots: one with the original y-limits and a second with the y-axis zoomed in to (0, 0.010).
def plot_TrainTestMSE(epoch, train_loss, test_loss):
    """Draw the per-epoch training and validation MSE curves side by side.

    Left panel keeps the natural y-limits; the right panel zooms the
    y-axis to (0, 0.010) so late-epoch behaviour is visible.

    Parameters:
        epoch: number of epochs the losses cover.
        train_loss: list of per-epoch training MSE values.
        test_loss: list of per-epoch validation MSE values.
    """
    epochs = range(1, epoch + 1)
    # Legend swatches; seaborn's default cycle draws the first line blue
    # and the second orange.
    train_patch = mpatches.Patch(color='blue', label='Train MSE')
    val_patch = mpatches.Patch(color='orange', label='Validation MSE')
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    for ax, panel_title in zip(axes, ('Original Plot', 'Modified Y-axis plot')):
        sns.lineplot(x=epochs, y=train_loss, ax=ax)
        sns.lineplot(x=epochs, y=test_loss, ax=ax)
        ax.title.set_text(panel_title)
        ax.set_xlabel("EPOCH")
    axes[0].set_ylabel("MSE")
    # Zoom the second panel before placing legends so "best" placement
    # is computed against the final limits.
    axes[1].set_ylim(0, 0.010)
    for ax in axes:
        ax.legend(handles=[train_patch, val_patch])
    # Drop the top spine for a cleaner look.
    sns.despine(top=True)
    plt.suptitle("Training and Validation loss");
Function to make predictions. It takes the model and test data as input parameters and returns both the real target values and the predicted target values, inverse-transformed back to price units.
def testPrediction(X_test,y_test,model):
predictions = model(X_test)
#converting both the data predicted and real values into numpy for using it in scaler object
data_real = predictions.data.numpy()
data_predicted = y_test.data.numpy()
#using scaler object as initilized before to inverse transform both the data (predicted and real values)
data_predicted = scaler.inverse_transform(data_predicted)
data_real = scaler.inverse_transform(data_real)
return data_real,data_predicted
The function below plots the real values of our test data against the predicted values from our model, which is passed as a parameter.
def plot_test(model, X_test):
    """Plot actual vs. predicted closing prices for `X_test` with plotly.

    Relies on the module-level `y_test` tensor and on plotly's
    `graph_objects` being available as `go` at call time (imported later
    in this notebook -- TODO confirm import order when running top-to-bottom).
    """
    data_real, data_predict = testPrediction(X_test, y_test, model)
    # 1-based sample index for the x axis. NOTE(review): the original upper
    # bound len(X_test)+10 is kept; plotly ignores the surplus x values.
    xs = list(range(1, len(X_test) + 10))
    fig = go.Figure()
    # BUG FIX: reshape(-1) replaces the hard-coded reshape(441), which only
    # worked for this notebook's 441-sample test split.
    fig.add_trace(go.Scatter(x=xs, y=data_real.reshape(-1),
                             mode='lines',
                             name='Original Data'))
    fig.add_trace(go.Scatter(x=xs, y=data_predict.reshape(-1),
                             mode='lines',
                             name='Predicted Data'))
    fig.update_layout(title_text="Original data points vs predicted data points")
    fig.show()
# Fix PyTorch's RNG so weight initialisation is reproducible across runs.
torch.manual_seed(0)
For the first task, plain backpropagation, we create a feed-forward model with two hidden layers and sigmoid activation functions. Since we created sequences of 4 features, we have 4 input nodes and our output is 1 node.
class PB(nn.Module):
    """Plain-backprop feed-forward regressor.

    Maps a window of 4 scaled closing prices to the next one:
    Linear(4, 16) -> sigmoid -> Linear(16, 32) -> sigmoid -> Linear(32, 1).
    """

    def __init__(self):
        super(PB, self).__init__()
        self.fc1 = nn.Linear(4, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        hidden = self.fc1(x).sigmoid()
        hidden = self.fc2(hidden).sigmoid()
        return self.fc3(hidden)
# NOTE(review): torchsummary's input_size normally excludes the batch
# dimension, so (4022, 4) describes a (batch, 4022, 4) input here. It still
# runs because nn.Linear only acts on the last dimension -- confirm intent.
summary(PB(),input_size=(4022,4))
Above is the summary of our model, it describes how much total memory is required to run the model and how much total size of our mode will be after successful training.
"""
We have to reshape the data for our linear model. Since we will pass one dimensional data, and our data is
in 2 dimension. So we will use reshape function to re-shape the data.
Example: [[[1],[2],[3]]] -> shape is (1,3,1)
The above data is 3 dimension, where the first dimension indicates total number of rows in our data, second and third
dimension gives us the dimension of the data passed into the model.
So we have to convert (1,3,1) => (1,3) so that we can pass that into our linear model.
"""
# Flatten the (N, 4, 1) windows to (N, 4) so they fit the Linear layers.
# The sample counts are hard-coded to this notebook's splits.
X_train1 = X_train.reshape(4022,4)
X_test1 = X_test.reshape(441,4)
X_val1 = X_val.reshape(218,4)
model = PB()
# Render the autograd graph of one forward pass (requires torchviz).
make_dot(model(X_train1), params=dict(model.named_parameters()))
Above is the flow diagram of our plain backpropagation model, showing how the layers are linked. At the top, the input-layer weights (with a transpose node) and a bias feed the first matrix-multiply backward node, whose result is passed through the sigmoid activation. The second layer's weights, the activation output and a bias then repeat the same pattern, and finally the output-layer weights, the last sigmoid result and a bias produce the model output. (These graph nodes are autograd backward functions recorded during the forward pass, not the Adam optimizer itself.)
We are first calling the model class and then called our training function with 300 epoch to train the model with the training data and validation data
# Train the plain-backprop feed-forward model for 300 epochs.
pb = PB()
train_loss,test_loss,pb_computations = training(300,pb,[X_train1,y_train],[X_val1,y_val])
# CPU process time (seconds) spent in the training loop.
pb_computations
Above is the result of our plain backpropogation model training and their training and validation loss. From the data, we can observe that the both training and validation were decreasing till 120 epoch and then the validation loss became constant. It took 1.67 seconds to train the model.
# Visualise the loss curves recorded during training.
plot_TrainTestMSE(300,train_loss,test_loss)
Above plot is plotted between number of epoch and MSE. As we can see from both the plots that the loss decreased heavingly in the first 50 epoch and later it started decreasing slowly. After the first 150 epoch, both the MSE training and validation decreasing very slowly and almost reached 0.001.
# Evaluate on the held-out test split; MSE is in original price units.
data_real,data_predict = testPrediction(X_test1,y_test,pb)
pb_mse = mean_squared_error(data_real,data_predict)
pb_mse
For the plain backpropogation we got mean squared value of 0.2706.
# Plot predicted vs. actual closing prices on the test split.
plot_test(pb,X_test1)
Above is the line graph of the original data points and our predicited data points. Our linear model was able to perform well, since it was able to capture the trend of the data.
We will be using a RNN layer followed a linear layer and a output layer. torch.RNN have two inputs, the input data and the hidden state. Details of the inputs are mentioned below
input of shape (seq_len, batch, input_size): tensor containing the features of the input sequence. The input can also be a packed variable length sequence. See torch.nn.utils.rnn.pack_padded_sequence() or torch.nn.utils.rnn.pack_sequence() for details.
h_0 of shape (num_layers * num_directions, batch, hidden_size): tensor containing the initial hidden state for each element in the batch. Defaults to zero if not provided. If the RNN is bidirectional, num_directions should be 2, else it should be 1.
class BPTT(nn.Module):
    """Vanilla RNN regressor: RNN -> Linear(hidden, 32) -> ReLU -> Linear(32, num_classes).

    Expects batch-first input of shape (batch, seq_len, input_size) and
    returns a (batch, num_classes) tensor.
    """

    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_len=None):
        super(BPTT, self).__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Backward compatible: fall back to the module-level `seq_length`
        # when no explicit window length is passed.
        self.seq_length = seq_length if seq_len is None else seq_len
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 32)
        self.fc2 = nn.Linear(32, num_classes)

    def forward(self, x):
        # Zero initial hidden state. Plain tensors suffice:
        # torch.autograd.Variable has been a deprecated no-op since 0.4.
        # (A plain RNN has no cell state -- the original comment was wrong.)
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        _, h_out = self.rnn(x, h_0)
        # BUG FIX: the original `h_out.view(-1, hidden_size)` stacked hidden
        # states of *all* layers (num_layers*batch rows when num_layers > 1).
        # Use the last layer's state; identical when num_layers == 1, the
        # configuration trained in this notebook.
        out = torch.relu(self.fc1(h_out[-1]))
        return self.fc2(out)
# Instantiate a 1-layer RNN (num_classes=1, input_size=1, hidden_size=2).
model = BPTT(1,1,2,1)
# Render the autograd graph of one forward pass (requires torchviz).
make_dot(model(X_train), params=dict(model.named_parameters()))
We are first calling the model class and then called our training function with 300 epoch to train the model with the training data and validation data
# Hyper-parameters: univariate input, 2 hidden units, a single recurrent
# layer, scalar regression output.
input_size = 1
hidden_size = 2
num_layers = 1
num_classes = 1
bptt = BPTT(num_classes, input_size, hidden_size, num_layers)
train_loss,test_loss,bptt_computation = training(300,bptt,[X_train,y_train],[X_val,y_val])
# CPU process time (seconds) spent training the RNN.
bptt_computation
Above is the result of our backpropagation-through-time (RNN) model, with its training and validation loss. From the data, we can observe that both training and validation loss were decreasing until around epoch 120, after which the validation loss became constant. It took 3.13 seconds to train the model, which is higher than our previous model (note: `time.process_time()` reports seconds, not milliseconds).
# Loss curves and held-out test MSE for the RNN model.
plot_TrainTestMSE(300,train_loss,test_loss)
data_real,data_predict = testPrediction(X_test,y_test,bptt)
bptt_mse = mean_squared_error(data_real,data_predict)
bptt_mse
We got 0.177 MSE on our test data, which has improved significantly in comparison to the previous model.
# Predicted vs. actual closing prices for the RNN model.
plot_test(bptt,X_test)
Above is the line graph of the original data points and our predicted data points. Our RNN model performed well, since it was able to capture the trend of the data.
LSTM model class
We have defined our LSTM class with input parameters as num of classes, input size, hidden size and number of layers. We are one layer of LSTM followed by a Linear layer.
Following are the inputs of our LSTM layer
input of shape (seq_len, batch, input_size):
tensor containing the features of the input sequence. The input can also be a packed variable length sequence. See torch.nn.utils.rnn.pack_padded_sequence() or torch.nn.utils.rnn.pack_sequence() for details.
h_0 of shape (num_layers * num_directions, batch, hidden_size):
tensor containing the initial hidden state for each element in the batch. If the LSTM is bidirectional, num_directions should be 2, else it should be 1.
c_0 of shape (num_layers * num_directions, batch, hidden_size):
tensor containing the initial cell state for each element in the batch.
If (h_0, c_0) is not provided, both h_0 and c_0 default to zero.
class LSTM(nn.Module):
    """LSTM regressor: LSTM -> Linear(hidden, 64) -> ReLU -> Linear(64, num_classes).

    Expects batch-first input of shape (batch, seq_len, input_size) and
    returns a (batch, num_classes) tensor.
    """

    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_len=None):
        super(LSTM, self).__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Backward compatible: fall back to the module-level `seq_length`
        # when no explicit window length is passed.
        self.seq_length = seq_length if seq_len is None else seq_len
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        # Zero initial hidden and cell states. Plain tensors suffice:
        # torch.autograd.Variable has been a deprecated no-op since 0.4.
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        _, (h_n, _) = self.lstm(x, (h_0, c_0))
        # BUG FIX: the original `h_n.view(-1, hidden_size)` stacked hidden
        # states of *all* layers, so a 2-layer LSTM (as built for the graph
        # below) returned (2*batch, 1). Use the last layer's state; this is
        # identical when num_layers == 1, the configuration trained here.
        out = torch.relu(self.fc1(h_n[-1]))
        return self.fc2(out)
# NOTE(review): this instance uses num_layers=2 purely for rendering the
# graph; the model actually trained below uses a single layer.
model = LSTM(1,1,2,2)
make_dot(model(X_train), params=dict(model.named_parameters()))
# Same hyper-parameters as the RNN: univariate input, 2 hidden units,
# one recurrent layer, scalar output.
input_size = 1
hidden_size = 2
num_layers = 1
num_classes = 1
lstm = LSTM(num_classes, input_size, hidden_size, num_layers)
train_loss,test_loss,lstm_computation = training(300,lstm,[X_train,y_train],[X_val,y_val])
# CPU process time (seconds) spent training the LSTM.
lstm_computation
Above is the result of our LSTM model, with its training and validation loss. From the data, we can observe that both training and validation loss were decreasing until around epoch 200, after which the validation loss became constant. It took 10.75 seconds to train the model, which is higher than both previous models.
# Loss curves, held-out test MSE, and prediction plot for the LSTM model.
plot_TrainTestMSE(300,train_loss,test_loss)
data_real,data_predict = testPrediction(X_test,y_test,lstm)
lstm_mse = mean_squared_error(data_real,data_predict)
lstm_mse
plot_test(lstm,X_test)
Above is the line graph of the original data points and our predicted data points. Our LSTM model performed well, since it was able to capture the trend of the data.
class GRU(nn.Module):
    """GRU regressor: GRU -> Linear(hidden, 32) -> Linear(32, num_classes).

    Expects batch-first input of shape (batch, seq_len, input_size) and
    returns a (batch, num_classes) tensor.
    """

    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_len=None):
        super(GRU, self).__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Backward compatible: fall back to the module-level `seq_length`
        # when no explicit window length is passed.
        self.seq_length = seq_length if seq_len is None else seq_len
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 32)
        self.fc2 = nn.Linear(32, num_classes)

    def forward(self, x):
        # Zero initial hidden state. Plain tensors suffice:
        # torch.autograd.Variable has been a deprecated no-op since 0.4.
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        # Propagate the input sequence through the GRU.
        _, h_n = self.gru(x, h_0)
        # BUG FIX: the original `h_n.view(-1, hidden_size)` stacked hidden
        # states of *all* layers (num_layers*batch rows when num_layers > 1).
        # Use the last layer's state; identical for num_layers == 1.
        out = self.fc1(h_n[-1])
        # NOTE(review): unlike BPTT/LSTM there is no ReLU after fc1 here --
        # possibly unintentional, but kept to preserve the reported results.
        return self.fc2(out)
# Same hyper-parameters as the RNN and LSTM runs.
input_size = 1
hidden_size = 2
num_layers = 1
num_classes = 1
gru = GRU(num_classes, input_size, hidden_size, num_layers)
train_loss,test_loss,gru_computation = training(300,gru,[X_train,y_train],[X_val,y_val])
# CPU process time (seconds) spent training the GRU.
gru_computation
Above is the result of our GRU model, with its training and validation loss. From the data, we can observe that both training and validation loss were decreasing until around epoch 200, after which the validation loss became constant. See the `gru_computation` output above for the training time.
# Loss curves, held-out test MSE, and prediction plot for the GRU model.
plot_TrainTestMSE(300,train_loss,test_loss)
data_real,data_predict = testPrediction(X_test,y_test,gru)
gru_mse = mean_squared_error(data_real,data_predict)
gru_mse
plot_test(gru,X_test)
Above is the line graph of the original data points and our predicted data points. Our GRU model performed well, since it was able to capture the trend of the data.
# Collect per-model training time (single element of each timing list)
# and test MSE for the comparison plot below.
Y_computation = [pb_computations[0],bptt_computation[0],lstm_computation[0],gru_computation[0]]
Y_mse = [pb_mse,bptt_mse,lstm_mse,gru_mse]
X = ["PB","BPTT","LSTM","GRU"]
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Side-by-side interactive comparison: training time (left) and test MSE
# (right) for the four models.
fig = make_subplots(rows=1, cols=2)
fig.add_trace(
    # BUG FIX: time.process_time() measures seconds, so the trace label
    # previously claiming "milliseconds" misstated the units.
    go.Scatter(x=X, y=Y_computation, name="Time Computation (seconds)"),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=X, y=Y_mse, name="Test MSE"),
    row=1, col=2
)
fig.update_layout(height=400, width=1000, title_text="Computation time and Test MSE plot")
fig.show()
Above is an interactive plot of the computation time and test MSE of all the models. The first plot shows how the computation time varies across our 4 models, and the second plot shows how the test MSE varies across them.
Plain backpropagation has the best computation time, since it is a simple model requiring far less computation than the LSTM and GRU.
We got best test mse in gru model as 0.24. The test mse decreases from PB -> BPTT -> LSTM -> GRU.